# Copy the report parameters into top-level variables used by the rest of
# the script. A parameter missing from `params` comes back as NULL.
output.var    <- params$output.var
transform.abs <- params$transform.abs
log.pred      <- params$log.pred

# Echo the parameters so the rendered report records how it was run.
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 2
## $ output.var: chr "y3"
## $ log.pred : logi TRUE
# Setup Labels
# output.var.tr = name of the column that holds the modelling target:
#  - if predicting on log scale, it is "<output.var>.log"
#  - if predicting on normal scale, it is output.var itself
# Fail early with a clear message when the flag is missing or NA, instead of
# letting `if` stop with an obscure error.
stopifnot(length(log.pred) == 1L, !is.na(log.pred))
# A single if/else (same test as the later transformation step) guarantees
# that output.var.tr is always defined.
if (log.pred == TRUE) {
  output.var.tr <- paste0(output.var, '.log')
} else {
  output.var.tr <- output.var
}
# Load the feature and label tables; they are joined on JobName below.
feat   <- read.csv('../../Data/features_highprec.csv')
labels <- read.csv('../../Data/labels.csv')

# Every feature column except the join key is treated as a predictor.
predictors <- feat %>%
  dplyr::select(-JobName) %>%
  names()

data.ori <- inner_join(feat, labels, by = 'JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')

# Completeness is evaluated over every column of the joined table, i.e. all
# columns of `labels` take part, not only output.var.
cc <- complete.cases(data.ori)
data.notComplete <- data.ori[!cc, ]

# Keep the complete rows, restricted to predictors, the target and the key.
data <- select_at(data.ori[cc, ], c(predictors, output.var, 'JobName'))
# Report how many rows were joined, dropped as incomplete, and kept.
message(paste0('Original cases: ', nrow(data.ori)))
## Original cases: 10000
message(paste0('Non-Complete cases: ', nrow(data.notComplete)))
## Non-Complete cases: 3020
message(paste0('Complete cases: ', nrow(data)))
## Complete cases: 6980
The output variable y3 is right-skewed, so we will proceed with a log transformation.
# Density histogram (50 bins) with a density curve for the raw target.
data %>%
  select_at(output.var) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

# Q-Q plot of the raw target.
data %>%
  select_at(output.var) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)
Normalization of y3 using the bestNormalize package (which suggests orderNorm). This is interesting, but I think it goes beyond the objective of this project.
# Let bestNormalize pick a normalizing transformation for the raw target.
# NOTE(review): `t` masks base::t() for the rest of the session; the name is
# kept because later statements read it.
t <- bestNormalize::bestNormalize(data[[output.var]])
# Print the comparison of candidate transformations.
t
## Best Normalizing transformation with 6980 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - No transform: 3.0147
## - Box-Cox: 1.4848
## - Log_b(x+a): 2.0717
## - sqrt(x+a): 2.492
## - exp(x): 748.8387
## - arcsinh(x): 2.0717
## - Yeo-Johnson: 1.2598
## - orderNorm: 1.2159
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6980 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 95.913 118.289 124.030 131.059 193.726
# Normal Q-Q plot of the target before the transformation chosen above...
data[[output.var]] %>% qqnorm()
# ...and of the values returned by predict() on the fitted transformation.
t %>% predict() %>% qqnorm()
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution
# Create the modelling target column: base-10 log of the raw target when
# log.pred is TRUE, otherwise a plain copy of the raw target.
data[[output.var.tr]] <- if (log.pred == TRUE) {
  log(data[[output.var]], base = 10)
} else {
  data[[output.var]]
}

# Density histograms of the raw and the transformed target.
data %>%
  select_at(c(output.var, output.var.tr)) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)
All predictors show a fat-tail situation, where the two tails are very tall and the density around the mean is low. The orderNorm transformation can help (see [Best Normalizator] section)
Histogram and QQ plot
# A hand-picked subset of predictors to inspect in detail.
cols <- c('x11', 'x18', 'stat98', 'x7')

# Density histograms of the selected predictors, one panel each.
data %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 2)

# Q-Q plots of the same predictors.
data %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 2)

# Summary statistics per selected predictor.
data %>%
  select_at(cols) %>%
  lapply(summary)
## $x11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07
##
## $x18
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.500 3.147 4.769 4.772 6.418 7.999
##
## $stat98
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.998619 -1.551882 -0.015993 -0.005946 1.528405 2.999499
##
## $x7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.700 1.266 1.854 1.852 2.446 3.000
Scatter plot vs. output variable **y3.log**
# Long format: one row per (predictor, value) pair, with the target column
# kept alongside as its own column.
d <- data %>%
  dplyr::select_at(c(cols, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)

# Target against each selected predictor, with a smoother per panel.
ggplot(d, mapping = aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light green', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
All predictors show strong signs of fat tails
# Density histograms for every predictor, one panel per predictor.
data %>%
  select_at(predictors) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

# Q-Q plots for every predictor.
data %>%
  select_at(predictors) %>%
  gather() %>%
  ggplot(aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)
#chart.Correlation(select(data,-JobName), pch=21)
# Correlation of every column except the target column and JobName with the
# target column, rounded to 4 decimals.
# NOTE(review): `t` masks base::t(); the name is kept as in the original.
t <- cor(
  dplyr::select(data, -one_of(output.var.tr, 'JobName')),
  select_at(data, output.var.tr)
) %>%
  round(4)
DT::datatable(t)

#chart.Correlation(select(data,-JobName), pch=21)
# Full correlation matrix of all columns except JobName, rounded to 4
# decimals and shown with horizontal scrolling.
t <- data %>%
  dplyr::select(-one_of('JobName')) %>%
  cor() %>%
  round(4)
DT::datatable(t, options = list(scrollX = TRUE))
Scatter plots with all predictors and the output variable (y3.log)
# Long format: one row per (predictor, value) pair, with the target column
# kept alongside as its own column.
d <- data %>%
  dplyr::select_at(c(predictors, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)

# Target against every predictor, with a smoother per panel.
ggplot(d, mapping = aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light blue', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
No Multicollinearity among predictors
Showing the top predictors by VIF value
# Variance inflation factor per predictor, largest first.
vifDF <- data %>%
  select_at(predictors) %>%
  usdm::vif() %>%
  arrange(desc(VIF))

# Show the ten predictors with the highest VIF.
head(vifDF, n = 10)
## Variables VIF
## 1 stat100 1.060877
## 2 stat209 1.060553
## 3 stat20 1.060198
## 4 stat178 1.060048
## 5 stat141 1.059520
## 6 stat207 1.057697
## 7 stat87 1.057394
## 8 stat154 1.057268
## 9 stat135 1.057038
## 10 stat104 1.057036
No transformation for x18
Log transformation for y3
# Work on a copy of the data with a square-root version of x18 added.
data.tr <- mutate(data, x18.sqrt = sqrt(x18))

# Compare x18 with its square root.
cols <- c('x18', 'x18.sqrt')

# Density histograms of both versions.
data.tr %>%
  select_at(cols) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)

# Long format: one row per (column, value) pair, with the target column kept
# alongside as its own column.
d <- data.tr %>%
  dplyr::select_at(c(cols, output.var.tr)) %>%
  gather(key = target, value = value, -!!output.var.tr)

# Target against each version of x18, with a smoother per panel.
ggplot(d, mapping = aes_string(x = 'value', y = output.var.tr)) +
  geom_point(color = 'light blue', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Removing unwanted variables:
#  - 'x18.sqrt', the helper column added by the mutate() above (the previous
#    drop list named 'x18sqrt', which matches no column, so it was never
#    removed);
#  - the raw target output.var, unless it is also the modelling target
#    (output.var.tr equals output.var when no log transform is applied).
unwanted.cols <- setdiff(c('x18.sqrt', output.var), output.var.tr)
data.tr <- data.tr %>%
  dplyr::select_at(setdiff(names(data.tr), unwanted.cols))
The target variable y3 can be LOG transformed
The predictor x18 does not improve with a SQRT transformation
All predictors could benefit from an orderNorm transformation